{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Integration with sklearn pipelines\n",
    "\n",
    "In this notebook, provide some illustration for integration with sklearn pipelines."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import keras\n",
    "import imblearn\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "import smote_variants as sv\n",
    "import imblearn.datasets as imb_datasets\n",
    "\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "random_seed= 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(random_seed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "libras= imb_datasets.fetch_datasets()['libras_move']\n",
    "X, y= libras['data'], libras['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.33)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fitting a pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "oversampler= sv.MulticlassOversampling(sv.distance_SMOTE())\n",
    "classifier= KNeighborsClassifier(n_neighbors= 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "model= Pipeline([('scale', StandardScaler()), ('clf', sv.OversamplingClassifier(oversampler, classifier))])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2019-07-28 15:48:30,348:INFO:MulticlassOversampling: Running multiclass oversampling with strategy equalize_1_vs_many_successive\n",
      "2019-07-28 15:48:30,350:INFO:MulticlassOversampling: Sampling minority class with label: 1\n",
      "2019-07-28 15:48:30,352:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}\")\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', OversamplingClassifier(classifier=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "           metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
       "           weights='uniform'),\n",
       "            oversampler=<smote_variants._smote_variants.MulticlassOversampling object at 0x7fa8d717ee48>))])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grid search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_grid= {'clf__oversampler':[sv.distance_SMOTE(proportion=0.5),\n",
    "                                 sv.distance_SMOTE(proportion=1.0),\n",
    "                                 sv.distance_SMOTE(proportion=1.5)]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "grid= GridSearchCV(model, param_grid= param_grid, cv= 3, n_jobs= 1, verbose= 2, scoring= 'accuracy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
      "2019-07-28 15:48:30,993:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}\")\n",
      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s\n",
      "2019-07-28 15:48:31,021:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}\")\n",
      "2019-07-28 15:48:31,043:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}\")\n",
      "2019-07-28 15:48:31,065:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}\")\n",
      "2019-07-28 15:48:31,091:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}\")\n",
      "2019-07-28 15:48:31,119:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}\")\n",
      "2019-07-28 15:48:31,147:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}\")\n",
      "2019-07-28 15:48:31,181:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}\")\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 3 folds for each of 3 candidates, totalling 9 fits\n",
      "[CV] clf__oversampler=('distance_SMOTE', \"{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}\") \n",
      "[CV]  clf__oversampler=('distance_SMOTE', \"{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}\"), total=   0.0s\n",
      "[CV] clf__oversampler=('distance_SMOTE', \"{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}\") \n",
      "[CV]  clf__oversampler=('distance_SMOTE', \"{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}\"), total=   0.0s\n",
      "[CV] clf__oversampler=('distance_SMOTE', \"{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}\") \n",
      "[CV]  clf__oversampler=('distance_SMOTE', \"{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}\"), total=   0.0s\n",
      "[CV] clf__oversampler=('distance_SMOTE', \"{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}\") \n",
      "[CV]  clf__oversampler=('distance_SMOTE', \"{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}\"), total=   0.0s\n",
      "[CV] clf__oversampler=('distance_SMOTE', \"{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}\") \n",
      "[CV]  clf__oversampler=('distance_SMOTE', \"{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}\"), total=   0.0s\n",
      "[CV] clf__oversampler=('distance_SMOTE', \"{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}\") \n",
      "[CV]  clf__oversampler=('distance_SMOTE', \"{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}\"), total=   0.0s\n",
      "[CV] clf__oversampler=('distance_SMOTE', \"{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}\") \n",
      "[CV]  clf__oversampler=('distance_SMOTE', \"{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}\"), total=   0.0s\n",
      "[CV] clf__oversampler=('distance_SMOTE', \"{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}\") \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2019-07-28 15:48:31,216:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}\")\n",
      "[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s finished\n",
      "2019-07-28 15:48:31,253:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', \"{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1}\")\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CV]  clf__oversampler=('distance_SMOTE', \"{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}\"), total=   0.0s\n",
      "[CV] clf__oversampler=('distance_SMOTE', \"{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}\") \n",
      "[CV]  clf__oversampler=('distance_SMOTE', \"{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1}\"), total=   0.0s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=3, error_score='raise-deprecating',\n",
       "       estimator=Pipeline(memory=None,\n",
       "     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', OversamplingClassifier(classifier=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "           metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
       "           weights='uniform'),\n",
       "            oversampler=<smote_variants._smote_variants.MulticlassOversampling object at 0x7fa8d717ee48>))]),\n",
       "       fit_params=None, iid='warn', n_jobs=1,\n",
       "       param_grid={'clf__oversampler': [<smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5c0>, <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5f8>, <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e630>]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
       "       scoring='accuracy', verbose=2)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9222222222222223\n",
      "{'mean_fit_time': array([0.00849088, 0.01005777, 0.01242367]), 'std_fit_time': array([0.00347365, 0.00070242, 0.0003146 ]), 'mean_score_time': array([0.0064021 , 0.00667214, 0.00908041]), 'std_score_time': array([0.0005612 , 0.00025586, 0.00024809]), 'param_clf__oversampler': masked_array(data=[<smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5c0>,\n",
      "                   <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5f8>,\n",
      "                   <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e630>],\n",
      "             mask=[False, False, False],\n",
      "       fill_value='?',\n",
      "            dtype=object), 'params': [{'clf__oversampler': <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5c0>}, {'clf__oversampler': <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e5f8>}, {'clf__oversampler': <smote_variants._smote_variants.distance_SMOTE object at 0x7fa8d719e630>}], 'split0_test_score': array([0.83333333, 0.80833333, 0.81666667]), 'split1_test_score': array([0.96666667, 0.94166667, 0.925     ]), 'split2_test_score': array([0.96666667, 0.96666667, 0.96666667]), 'mean_test_score': array([0.92222222, 0.90555556, 0.90277778]), 'std_test_score': array([0.06285394, 0.06949998, 0.06322115]), 'rank_test_score': array([1, 2, 3], dtype=int32), 'split0_train_score': array([1., 1., 1.]), 'split1_train_score': array([0.98333333, 0.97916667, 0.975     ]), 'split2_train_score': array([0.98333333, 0.97916667, 0.97916667]), 'mean_train_score': array([0.98888889, 0.98611111, 0.98472222]), 'std_train_score': array([0.00785674, 0.00982093, 0.01093612])}\n"
     ]
    }
   ],
   "source": [
    "print(grid.best_score_)\n",
    "print(grid.cv_results_)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
